Hi, this is Chandrasekhar, and here is my solution to the PML (Practical Machine Learning) assignment.
We first load the data, remove the columns that have too many missing values, and then compute the correlations between the remaining predictors and the final `classe` variable.
# Load the raw training / grading data.  Empty strings, "NA" and Excel
# division errors ("#DIV/0!") are all treated as missing values.
dftr <- read.csv("pml-training.csv", stringsAsFactors = FALSE,
                 na.strings = c("", "NA", "#DIV/0!"))
dft <- read.csv("pml-testing.csv", stringsAsFactors = FALSE,
                na.strings = c("", "NA", "#DIV/0!"))
# Hold out 30% of the labelled rows as a validation set (caret).
intr <- createDataPartition(dftr$classe, p = 0.7, list = FALSE)
dfv <- dftr[-intr, ]
dftr <- dftr[intr, ]
# Count missing values per column for each sensor group.  contains() does the
# same fixed-substring match as the original grepl() calls, and
# colSums(is.na(.)) replaces the per-column sapply() loop.
na_count <- function(pattern) {
  # NA count for every training column whose name contains `pattern`.
  colSums(is.na(select(dftr, contains(pattern))))
}
bm <- na_count("_belt")
am <- na_count("_arm")
fm <- na_count("_forearm")
dm <- na_count("_dumbbell")
# Columns to drop: any sensor column with at least one NA in the training set.
c2d <- c(names(bm[bm != 0]),
         names(am[am != 0]),
         names(fm[fm != 0]),
         names(dm[dm != 0]))
# Keep only the fully-observed sensor columns; drop the identifier and
# timestamp columns that carry no predictive signal.  as_tibble() replaces
# the deprecated tbl_df(), and all_of() makes the external-vector selection
# explicit (this is exactly what the printed warnings below recommend).
dfa <- as_tibble(dftr %>%
  select(-all_of(c2d),
         -c(X, user_name, raw_timestamp_part_1,
            raw_timestamp_part_2, cvtd_timestamp,
            new_window, num_window)))
## Warning: `tbl_df()` is deprecated as of dplyr 1.0.0.
## Please use `tibble::as_tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(c2d)` instead of `c2d` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
# The outcome must be a factor for the caret classifiers used later.
dfa$classe <- as.factor(dfa$classe)
# The 52 remaining predictors are sensor readings; force numeric storage.
dfa[, 1:52] <- lapply(dfa[, 1:52], as.numeric)
# Find the names of predictors involved in any strong pairwise correlation
# (|r| > 0.8); zero the diagonal so self-correlations are ignored.
cc <- cor(select(dfa, -classe))
diag(cc) <- 0
cc <- which(abs(cc) > 0.8, arr.ind = TRUE)
cc <- unique(rownames(cc))
corrplot(cor(select(dfa, all_of(cc))),
         type = "lower", order = "hclust", method = "number")
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(cc)` instead of `cc` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.

# Binarise the predictors and draw one correlation funnel per class so we
# can read off which predictors separate each class best (correlationfunnel).
cfd <- dfa %>% binarize(n_bins = 4, thresh_infreq = 0.01)
coa <- cfd %>% correlate(target = classe__A)
coa %>% plot_correlation_funnel(interactive = TRUE, limits = c(-0.5, 0.5))
cob <- cfd %>% correlate(target = classe__B)
cob %>% plot_correlation_funnel(interactive = TRUE, limits = c(-0.5, 0.5))
coc <- cfd %>% correlate(target = classe__C)
coc %>% plot_correlation_funnel(interactive = TRUE, limits = c(-0.5, 0.5))
cod <- cfd %>% correlate(target = classe__D)
cod %>% plot_correlation_funnel(interactive = TRUE, limits = c(-0.5, 0.5))
Let us now plot the pairwise relationships among the selected predictors.
# Top predictors per class, read off the correlation funnels above.
acol <- c("magnet_arm_x", "pitch_forearm", "magnet_dumbbell_y",
          "roll_forearm", "gyros_dumbbell_y")
bcol <- c("magnet_dumbbell_y", "magnet_dumbbell_x", "roll_dumbbell",
          "magnet_belt_y", "accel_dumbbell_x")
ccol <- c("magnet_dumbbell_y", "roll_dumbbell", "accel_dumbbell_y",
          "magnet_dumbbell_x", "magnet_dumbbell_z")
dcol <- c("pitch_forearm", "magnet_arm_y", "magnet_forearm_x",
          "accel_dumbbell_y", "accel_forearm_x")
ecol <- c("magnet_belt_y", "magnet_belt_z", "roll_belt",
          "gyros_belt_z", "magnet_dumbbell_y")
# De-duplicated union of all candidate predictors.  unique() keeps
# first-occurrence order, exactly as the incremental union() loop did,
# without growing a vector one element at a time.
fic <- unique(c(acol, bcol, ccol, dcol, ecol))
# Reduce the tidy training frame to the 17 selected predictors plus outcome;
# all_of() makes the external-vector selection explicit.
dfa_2 <- dfa %>% select(all_of(fic), classe)
# How many selected predictors come from each sensor location?
data.frame("arm" = sum(grepl("_arm", fic)),
           "forearm" = sum(grepl("_forearm", fic)),
           "belt" = sum(grepl("_belt", fic)),
           "dumbbell" = sum(grepl("_dumbbell", fic)))
# Diagonal panel for ggpairs: per-class density curves, Set2 fill palette.
demy <- function(data, mapping, ...) {
  base <- ggplot(data = data, mapping = mapping)
  base +
    geom_density(..., alpha = 0.3) +
    scale_fill_brewer(palette = "Set2")
}
# Lower panel for ggpairs: translucent scatter points, Set2 fill palette.
pomy <- function(data, mapping, ...) {
  base <- ggplot(data = data, mapping = mapping)
  base +
    geom_point(..., alpha = 0.1) +
    scale_fill_brewer(palette = "Set2")
}
# Pairs plots of the selected predictors, coloured by class, in panels of a
# few columns each so every page stays readable.  print() makes the
# autoprinting of each ggpairs object explicit.
pair_panel <- function(cols) {
  print(ggpairs(dfa_2, columns = cols, aes(color = classe),
                lower = list(continuous = pomy),
                diag = list(continuous = demy)))
}
pair_panel(1:5)

pair_panel(6:10)

pair_panel(11:17)

# Rebuild train / validation frames from the raw splits using only the
# selected predictors, then centre/scale/Box-Cox transform the predictors.
dftrF <- dftr %>% select(all_of(fic), classe)
dfvF <- dfv %>% select(all_of(fic), classe)
# lapply keeps the data.frame column-wise (sapply would build a matrix first).
dftrF[, 1:17] <- lapply(dftrF[, 1:17], as.numeric)
dfvF[, 1:17] <- lapply(dfvF[, 1:17], as.numeric)
lvs <- c("A", "B", "C", "D", "E")
# Fit the preprocessing on the training predictors only (column 18 is
# classe) and apply the same transform to both sets.
ppo <- preProcess(dftrF[, -18], method = c("center", "scale", "BoxCox"))
xt <- predict(ppo, select(dftrF, -classe))
yt <- factor(dftrF$classe, levels = lvs)
xv <- predict(ppo, select(dfvF, -classe))
yv <- factor(dfvF$classe, levels = lvs)
# Five-fold cross-validation for every caret model below.
ctrlt <- trainControl(method = "cv", number = 5)
# Baseline: single classification tree.
CTm <- train(x = xt, y = yt, method = "rpart", trControl = ctrlt)
# Random forest, tuned on accuracy.
RFm <- train(x = xt, y = yt, method = "rf", trControl = ctrlt,
             verbose = FALSE, metric = "Accuracy")
# Gradient-boosted trees.
GBMm <- train(x = xt, y = yt, method = "gbm", trControl = ctrlt,
              verbose = FALSE)
# Polynomial-kernel SVM fitted directly via e1071 (no caret wrapper).
SVMm <- svm(x = xt, y = yt, kernel = "polynomial", cost = 10)
confusionMatrix(predict(CTm,xv),yv)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1515 473 486 420 144
## B 30 383 36 179 161
## C 128 283 504 365 291
## D 0 0 0 0 0
## E 1 0 0 0 486
##
## Overall Statistics
##
## Accuracy : 0.4907
## 95% CI : (0.4779, 0.5036)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3347
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9050 0.33626 0.49123 0.0000 0.44917
## Specificity 0.6383 0.91445 0.78041 1.0000 0.99979
## Pos Pred Value 0.4987 0.48542 0.32081 NaN 0.99795
## Neg Pred Value 0.9442 0.85165 0.87900 0.8362 0.88959
## Prevalence 0.2845 0.19354 0.17434 0.1638 0.18386
## Detection Rate 0.2574 0.06508 0.08564 0.0000 0.08258
## Detection Prevalence 0.5162 0.13407 0.26695 0.0000 0.08275
## Balanced Accuracy 0.7717 0.62536 0.63582 0.5000 0.72448
confusionMatrix(predict(RFm,xv),yv)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1669 13 0 2 0
## B 5 1107 21 2 0
## C 0 16 999 26 0
## D 0 3 6 934 3
## E 0 0 0 0 1079
##
## Overall Statistics
##
## Accuracy : 0.9835
## 95% CI : (0.9799, 0.9866)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9791
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9970 0.9719 0.9737 0.9689 0.9972
## Specificity 0.9964 0.9941 0.9914 0.9976 1.0000
## Pos Pred Value 0.9911 0.9753 0.9597 0.9873 1.0000
## Neg Pred Value 0.9988 0.9933 0.9944 0.9939 0.9994
## Prevalence 0.2845 0.1935 0.1743 0.1638 0.1839
## Detection Rate 0.2836 0.1881 0.1698 0.1587 0.1833
## Detection Prevalence 0.2862 0.1929 0.1769 0.1607 0.1833
## Balanced Accuracy 0.9967 0.9830 0.9825 0.9832 0.9986
plot(RFm$finalModel,main="Error Graph")

confusionMatrix(predict(GBMm,xv),yv)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1635 49 0 3 4
## B 22 972 50 25 30
## C 11 65 942 64 22
## D 3 52 33 864 18
## E 3 1 1 8 1008
##
## Overall Statistics
##
## Accuracy : 0.9212
## 95% CI : (0.914, 0.9279)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9003
##
## Mcnemar's Test P-Value : 1.428e-15
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9767 0.8534 0.9181 0.8963 0.9316
## Specificity 0.9867 0.9732 0.9667 0.9785 0.9973
## Pos Pred Value 0.9669 0.8844 0.8533 0.8907 0.9873
## Neg Pred Value 0.9907 0.9651 0.9824 0.9797 0.9848
## Prevalence 0.2845 0.1935 0.1743 0.1638 0.1839
## Detection Rate 0.2778 0.1652 0.1601 0.1468 0.1713
## Detection Prevalence 0.2873 0.1867 0.1876 0.1648 0.1735
## Balanced Accuracy 0.9817 0.9133 0.9424 0.9374 0.9645
confusionMatrix(predict(SVMm,xv),yv)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1652 65 22 17 2
## B 6 1005 28 4 10
## C 10 60 960 68 10
## D 6 8 12 872 30
## E 0 1 4 3 1030
##
## Overall Statistics
##
## Accuracy : 0.9378
## 95% CI : (0.9313, 0.9438)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9212
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9869 0.8824 0.9357 0.9046 0.9519
## Specificity 0.9748 0.9899 0.9695 0.9886 0.9983
## Pos Pred Value 0.9397 0.9544 0.8664 0.9397 0.9923
## Neg Pred Value 0.9947 0.9723 0.9862 0.9814 0.9893
## Prevalence 0.2845 0.1935 0.1743 0.1638 0.1839
## Detection Rate 0.2807 0.1708 0.1631 0.1482 0.1750
## Detection Prevalence 0.2987 0.1789 0.1883 0.1577 0.1764
## Balanced Accuracy 0.9808 0.9361 0.9526 0.9466 0.9751
# Score the graded test cases with each trained model; all_of() makes the
# external-vector selection explicit.
newdft <- dft %>% select(all_of(fic), problem_id)
xTest <- newdft %>% select(all_of(fic))
finalr <- data.frame("problem_id" = dft$problem_id,
                     "RF" = predict(RFm, xTest),
                     "GBM" = predict(GBMm, xTest),
                     "SVM" = predict(SVMm, xTest))
finalr